# Load the insurance data set from the working directory and inspect
# its column types and first values.
insurance <- read.csv(file = "insurance_cost.csv")
str(object = insurance)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
library(dplyr)
library(ggplot2)
library(plotly)
library(ggbiplot)
library(ggpubr)
library(corrplot)
library(corrr)
library(caret)
library(factoextra)
# BMI vs. charges scatter plot in plotly (No. 2), coloured by smoker status.
# The trace type and mode are given explicitly so plotly does not have to
# infer them (and does not emit "No trace type specified" /
# "No scatter mode specified" messages).
plot_ly(
  data = insurance,
  x = ~bmi,
  y = ~charges,
  color = ~smoker,
  type = "scatter",
  mode = "markers"
)
# Same BMI vs. charges scatter plot, built with ggplot2 first and then
# converted to an interactive widget with ggplotly().
plot <- ggplot(
  data = insurance,
  mapping = aes(x = bmi, y = charges, color = smoker)
) +
  geom_point(size = 1.5) +
  theme_light()

ggplotly(p = plot)
# Keep only the numeric columns for the correlation analysis.
# Predicate functions are wrapped in where(): passing them bare to select()
# has been deprecated since tidyselect 1.1.0 and raised warnings.
# The union keeps integer columns first, then the remaining numeric ones,
# so the resulting column order is age, children, bmi, charges.
insurance_for_cor <- insurance %>%
  select(where(is.integer) | where(is.numeric))
# Pairwise correlation matrix of the numeric columns (cor() defaults),
# printed to the console.
insurance_cor <- insurance_for_cor %>%
  cor()
print(insurance_cor)
## age children bmi charges
## age 1.0000000 0.04246900 0.1092719 0.29900819
## children 0.0424690 1.00000000 0.0127589 0.06799823
## bmi 0.1092719 0.01275890 1.0000000 0.19834097
## charges 0.2990082 0.06799823 0.1983410 1.00000000
# Three views of the same correlation matrix.
# 1. Upper triangle as coloured tiles, variables in alphabetical order.
corrplot(
  corr = insurance_cor,
  type = "upper",
  order = "alphabet",
  method = "color"
)

# 2. Mixed layout: pies above the diagonal, coloured tiles below,
#    variables in "AOE" order.
corrplot.mixed(
  corr = insurance_cor,
  upper = "pie",
  lower = "color",
  order = "AOE"
)

# 3. corrr's rplot() view of the matrix.
rplot(insurance_cor)
# First, take all nominal variables (sex, smoker, region) and turn them
# into binary indicator columns.
dummy <- dummyVars(formula = " ~ sex + smoker + region", data = insurance)
dummy_insurance <- dummy %>%
  predict(newdata = insurance) %>%
  data.frame()
# Then combine them with the remaining numeric variables taken from the
# original data frame.
other_insurance <- select(insurance, age, bmi, children, charges)
new_insurance <- bind_cols(other_insurance, dummy_insurance)

# Quick look at the combined, all-numeric data frame.
glimpse(new_insurance)
## Rows: 1,338
## Columns: 12
## $ age <int> 19, 18, 28, 33, 32, 31, 46, 37, 37, 60, 25, 62, 23, 56…
## $ bmi <dbl> 27.900, 33.770, 33.000, 22.705, 28.880, 25.740, 33.440…
## $ children <int> 0, 1, 3, 0, 0, 0, 1, 3, 2, 0, 0, 0, 0, 0, 0, 1, 1, 0, …
## $ charges <dbl> 16884.924, 1725.552, 4449.462, 21984.471, 3866.855, 37…
## $ sexfemale <dbl> 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, …
## $ sexmale <dbl> 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, …
## $ smokerno <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, …
## $ smokeryes <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, …
## $ regionnortheast <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, …
## $ regionnorthwest <dbl> 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ regionsoutheast <dbl> 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, …
## $ regionsouthwest <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, …
# Standardise every variable with scale().
insurance_scaled <- new_insurance %>%
  scale()

# Compute the pairwise Euclidean distances between observations.
insurance_dist <- dist(x = insurance_scaled, method = "euclidean")

# Preview the distances among the first six observations.
as.matrix(insurance_dist)[seq_len(6), seq_len(6)]
## 1 2 3 4 5 6
## 1 0.000000 5.825239 6.253322 5.747217 5.759522 4.978144
## 2 5.825239 0.000000 1.823634 4.289327 3.582563 3.361726
## 3 6.253322 1.823634 0.000000 4.663256 4.148789 3.956548
## 4 5.747217 4.289327 4.663256 0.000000 1.807952 4.583438
## 5 5.759522 3.582563 4.148789 1.807952 0.000000 4.329507
## 6 4.978144 3.361726 3.956548 4.583438 4.329507 0.000000
# Build the hierarchical clustering (dendrogram) with the "ward.D2" method.
insurance_hc <- insurance_dist %>%
  hclust(method = "ward.D2")

# And visualise it; cex = 0.1 shrinks the leaf labels.
fviz_dend(x = insurance_hc, cex = 0.1)